library(data.table)
library(pdftools)
library(tokenizers)
library(tidytext)

# This script calculates Chi2 and relative usage statistics for terms in
# Beveridge's analysis of unemployment relative to writings supportive of the
# existing welfare system

# The data paths used below are relative ("Data/..."), so the working
# directory has to be the replication archive; adjust this location when
# running the script on another machine.
setwd("~/Dropbox (Personal)/Trade shocks 1900s/Replication Archive")

# Corpus 1: writings that defend the attitudes of the old poor law.
old_texts <- paste0(
    "Data/Books for Beveridge terms/",
    c(
        "Bosanquet summary of poor law report.pdf",
        "Criticism of poor law minority report.pdf",
        "Montague old poor law new socialism.pdf",
        "Senior Poor Law.pdf",
        "Smiles Self Help.pdf",
        "Vagrancy problem.pdf"
    )
)

# Corpus 2: writings that argue for poor law reform.
new_texts <- paste0(
    "Data/Books for Beveridge terms/",
    "Beveridge Unemployment.pdf"
)

# show the resolved paths of the old poor law corpus
old_texts

# Read a PDF and split its text into sentences, leaving out pages at the
# start and at the end of the document.
#
# Args:
#   pdf_location: path of the PDF file, passed to pdf_text().
#   skip_front: number of leading pages to drop (default 4).
#   skip_back: number of trailing pages to drop (default 4).
#
# Returns:
#   The unlisted result of tokenize_sentences() on the retained pages, or
#   character(0) (with a warning) when no page is left after trimming.
get_sentence_vec <- function(pdf_location, skip_front = 4L, skip_back = 4L) {
    pages <- pdf_text(pdf_location)
    first_page <- skip_front + 1L
    last_page <- length(pages) - skip_back

    # `first:last` counts backwards when first > last, so a short document
    # would yield reversed pages, NA entries and a zero index instead of
    # nothing; handle that case explicitly.
    if (first_page > last_page) {
        warning("no pages left in '", pdf_location, "' after dropping ",
            skip_front, " leading and ", skip_back, " trailing pages",
            call. = FALSE)
        return(character(0))
    }

    unlist(tokenize_sentences(pages[first_page:last_page]))
}

# Extract the sentences of several PDFs and concatenate them.
#
# Args:
#   pdf_location_vec: character vector of PDF paths.
#
# Returns:
#   One flat vector with the sentences of every file, in input order. Element
#   names start with the path (or the input name) of the originating file.
get_sentence_vec_from_pdf_vec <- function(pdf_location_vec) {
    # lapply() always returns a list. sapply() simplified the result to a
    # matrix whenever every file produced the same number of sentences (e.g.
    # when a single file was supplied), so the return type was unstable.
    sentence_vecs <- lapply(pdf_location_vec, get_sentence_vec)
    if (is.null(names(sentence_vecs)) && is.character(pdf_location_vec)) {
        names(sentence_vecs) <- pdf_location_vec
    }
    unlist(sentence_vecs)
}

old_vec <- get_sentence_vec_from_pdf_vec(old_texts)
head(old_vec)
length(old_vec)
new_vec <- get_sentence_vec_from_pdf_vec(new_texts)
length(new_vec)
# only the old corpus needs per-file names later on; keep the reform corpus
# as a plain unnamed vector
new_vec <- unname(new_vec)
head(new_vec)

# Gentzkow and Shapiro (2010, Econometrica) method for identifying
# distinguishing phrases.
#
# Args:
#   df: data.frame or data.table with one row per term occurrence.
#   group_var: name of the column holding the group indicator. It must take
#     exactly two values; in sorted order the first is treated as the "other"
#     corpus and the second as the corpus of interest (e.g. 0 and 1).
#   text_var: name of the column holding the term (n-gram).
#
# Returns:
#   data.table with one row per term, ordered by decreasing chi2:
#     word      - the term
#     chi2      - statistic computed from the 2x2 table of
#                 (this term / all other terms) x (group / other)
#     group_rel - the term's share of the group corpus minus its share of the
#                 other corpus, divided by its share of the pooled corpus
gs_chi2 <- function(df, group_var, text_var = "text") {
    # `[[` extracts plain vectors from data.frames and data.tables alike
    terms <- df[[text_var]]
    groups <- df[[group_var]]
    if (is.null(terms) || is.null(groups)) {
        stop("gs_chi2(): columns '", text_var, "' and '", group_var,
            "' must both exist in df", call. = FALSE)
    }

    # the column renaming below is only meaningful for a two-group comparison
    n_groups <- length(unique(groups))
    if (n_groups != 2L) {
        stop("gs_chi2(): '", group_var,
            "' must take exactly two values, found ", n_groups, call. = FALSE)
    }

    # count occurrences of every term within each group; combinations that
    # never occur are filled with 0
    occurrences <- data.table(text = terms, group = groups, occurrence = 1L)
    df_word_counts <- dcast(occurrences, text ~ group,
        value.var = "occurrence", fun.aggregate = length, fill = 0L)
    setnames(df_word_counts, c("word", "n_word_other", "n_word_group"))

    # work in doubles: the products in chi2 can exceed the integer range
    df_word_counts[, `:=`(n_word_other = as.numeric(n_word_other),
        n_word_group = as.numeric(n_word_group))]

    # corpus sizes, and per term the count of all *other* terms in each corpus
    df_word_counts[, n_all_word_group := sum(n_word_group)]
    df_word_counts[, n_all_word_other := sum(n_word_other)]
    df_word_counts[, n_other_word_group := n_all_word_group - n_word_group]
    df_word_counts[, n_other_word_other := n_all_word_other - n_word_other]

    df_word_counts[, chi2 :=
        (n_word_group * n_other_word_other - n_word_other * n_other_word_group) ^ 2 /
        ( (n_word_group + n_word_other) * (n_word_group + n_other_word_group) *
        (n_word_other + n_other_word_other) * (n_other_word_group + n_other_word_other) )]
    setorder(df_word_counts, -chi2)
    df_word_counts[, group_rel :=
        (n_word_group / n_all_word_group - n_word_other / n_all_word_other) /
        ((n_word_group + n_word_other) / (n_all_word_group + n_all_word_other))]
    df_word_counts[, .(word, chi2, group_rel)]
}

# Stack both corpora: one row per sentence, group = 0 for the old poor law
# texts and group = 1 for Beveridge.
tot_vec <- c(old_vec, new_vec)
new <- c(rep(0, length(old_vec)), rep(1, length(new_vec)))
head(tot_vec)
length(names(old_vec))
length(old_vec)
length(names(new_vec))
df_text <- data.table(text = tot_vec,
    files = c(names(old_vec), rep("Beveridge", length(new_vec))),
    group = new)
head(df_text)
# split every sentence into single words (1-grams), one row per occurrence
df_text <- unnest_tokens(df_text, "text", "text", token = "ngrams", n = 1)

head(df_text)
# calculate Chi2 and relative frequencies
df_gs <- gs_chi2(df_text, "group", "text")
# order by Chi2
setorder(df_gs, -chi2)
# print terms with highest Chi2 for which relative frequency is greater than 3;
# head() returns fewer rows instead of NA padding when under 30 terms qualify
print(head(df_gs[group_rel > 3], 30), nrows = 30)
